library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: rpart
## Registered S3 method overwritten by 'spatstat':
## method from
## print.boxx cli
##
## spatstat 1.64-1 (nickname: 'Help you I can, yes!')
## For an introduction to spatstat, type 'beginner'
##
## Note: spatstat version 1.64-1 is out of date by more than 8 months; we recommend upgrading to the latest version.
devtools::load_all("chigcrim/")
## Loading chigcrim
##
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
##
## matches
## The following object is masked from 'package:purrr':
##
## is_null
## The following object is masked from 'package:tidyr':
##
## matches
For the training data we will use historical records from 2016–2018. We will then predict on the 2019 data to test the model.
# Training data: historical crime records for 2016-2018.
# 2019 is held out as the test set for evaluating the fitted models.
train_list <- lapply(2016:2018, load_data, strings_as_factors = TRUE)
crime_2016 <- train_list[[1]]
crime_2017 <- train_list[[2]]
crime_2018 <- train_list[[3]]
# bind_rows is faster than rbind for stacking the yearly tibbles
crime_train <- bind_rows(train_list)
# Load test data
crime_2019 <- load_data(2019, strings_as_factors = TRUE)
crime_test <- crime_2019
It is then necessary to prepare the data for its use within the Generalised Additive Models.
# Extract useful information from the date
# Include week for later use
# Derive calendar features from the raw timestamp.
# week is kept for later use in the spatial model.
crime_train <- crime_train %>%
  mutate(month = factor(month(date)),
         week  = factor(week(date)),
         day   = factor(day(date)),
         hour  = factor(hour(date)),
         yday  = yday(date),
         date  = date(date))
# Derive the same calendar features for the test set.
# FIX: week was previously computed as factor(month(date)) — a
# copy-paste error that made the test "week" feature a month index,
# incomparable with the training data's week-of-year.
crime_test %<>%
  mutate(month = factor(month(date)),
         week = factor(week(date)),
         day = factor(day(date)),
         hour = factor(hour(date)),
         yday = yday(date),
         date = date(date))
# List of features to use in GAM
keep_features <- c("month", "week", "year", "day", "date", "community_area",
                   "fbi_code", "yday")
# Prep data for use in GAM.
# Use all_of(keep_features) to suppress the external-vector warning.
# NOTE: day was created as a factor; as.numeric() on a factor returns the
# internal level codes, which only equal the day-of-month when every day
# 1-31 happens to be present. Converting via as.character() always
# recovers the label value.
prep_for_gam <- function(dat) {
  dat %>%
    select(all_of(keep_features)) %>%
    na.omit() %>%
    mutate(day = as.numeric(as.character(day)))
}
gam_train <- prep_for_gam(crime_train)
gam_test <- prep_for_gam(crime_test)
gam_train
## # A tibble: 806,492 x 8
## month week year day date community_area fbi_code yday
## <fct> <fct> <int> <dbl> <date> <int> <fct> <dbl>
## 1 1 1 2016 2 2016-01-02 12 26 2
## 2 1 2 2016 11 2016-01-11 1 06 11
## 3 1 2 2016 11 2016-01-11 38 07 11
## 4 1 2 2016 13 2016-01-13 2 08B 13
## 5 1 1 2016 6 2016-01-06 48 26 6
## 6 1 2 2016 10 2016-01-10 48 26 10
## 7 1 3 2016 15 2016-01-15 19 08B 15
## 8 1 3 2016 18 2016-01-18 74 03 18
## 9 1 2 2016 13 2016-01-13 19 08B 13
## 10 1 3 2016 21 2016-01-21 19 08B 21
## # … with 806,482 more rows
We are interested in predicting the daily number of crimes. Each row represents a single crime, so we need to count the rows for each day to obtain the daily totals; the count function achieves this. Later we will look at breaking the data down further to predict how many crimes of each type (by FBI code) will occur in each community area on a given day.
# Collapse to one row per day: n = number of crimes on that date.
# (Grouped tally is quicker than aggregate().)
train_dat <- gam_train %>%
  group_by(yday, year, date) %>%
  tally() %>%
  ungroup() %>%
  arrange(year, yday)
test_dat <- gam_test %>%
  group_by(yday, year, date) %>%
  tally() %>%
  ungroup() %>%
  arrange(year, yday)
We now build the GAMs using the mgcv package.
# GAM 1: cyclic cubic smooth over day-of-year plus a linear year trend.
# Poisson family since n is a daily count.
gam1 <- gam(n ~ s(as.numeric(yday), bs = "cc") + year, data = train_dat,
family = "poisson")
# Observed vs fitted: points should hug the 45-degree line.
plot(train_dat$n, predict(gam1, type = "response"))
abline(a = 0, b = 1, col = "red")
# Fitted seasonal curve (red) overlaid on the daily counts.
plot(train_dat$date, train_dat$n)
points(train_dat$date, predict(gam1, type = "response"), col = "red")
# Predict over train + test together; blue points after the dashed line
# are out-of-sample (2019) predictions.
all_dat <- rbind(train_dat, test_dat)
plot(all_dat$date, all_dat$n, xlab = "Date", ylab = "Daily crimes")
points(all_dat$date, predict(gam1, type = "response", newdata = all_dat),
col = c(rep("red", nrow(train_dat)), rep("blue", nrow(test_dat))))
abline(v = date("2019-01-01"), lty = "dashed")
There are several features that may be of interest, whether a day is a weekend or not, the number of crimes over the previous month, the number of crimes on the same day of the previous year.
add_prev_day <- function(dat, start_date) {
  # Attach the previous day's crime count as n_pre.
  # Dates with no record for the preceding day get 0; the first day of
  # the data (start_date) gets NA since no prior data exists.
  yesterday <- data.frame(date = dat$date - days(1))
  prev_n <- left_join(yesterday, dat, by = "date")$n
  prev_n[is.na(prev_n)] <- 0
  prev_n[dat$date == start_date] <- NA
  dat$n_pre <- prev_n
  dat
}
add_prev_week <- function(dat, start_date) {
  # Attach the count from the same weekday one week earlier as n_pre.
  # FIX: this was a byte-for-byte copy of add_prev_day — it lagged by
  # days(1) instead of days(7) and only blanked the first day.
  dat$n_pre <- left_join(data.frame(date = dat$date - days(7)), dat, by = "date")$n
  # Dates with no record one week earlier default to 0 crimes.
  dat$n_pre[is.na(dat$n_pre)] <- 0
  # The entire first week has no week-earlier data, so mark it NA.
  dat$n_pre[dat$date < as.Date(start_date) + days(7)] <- NA
  return(dat)
}
add_dow <- function(dat) {
  # Day of week from the date (lubridate default numbering; 1 = Sunday
  # unless the lubridate.week.start option is overridden).
  dat[["dow"]] <- wday(dat[["date"]])
  dat
}
add_is_fom <- function(dat) {
  # TRUE when the date is the first day of its month ("fom").
  dat[["is_fom"]] <- mday(dat[["date"]]) == 1L
  dat
}
add_is_christmas <- function(dat) {
  # TRUE for Christmas Eve, Christmas Day and Boxing Day (24-26 Dec).
  # Compute the day-month string once and test membership with %in%
  # instead of three format() calls chained with `|`; %in% also yields
  # FALSE (not NA) for any missing dates.
  day_month <- format(dat$date, "%d-%m")
  dat$is_christmas <- day_month %in% c("24-12", "25-12", "26-12")
  return(dat)
}
add_is_NYD <- function(dat) {
  # TRUE when the date is New Year's Day (1 January).
  day_month <- format(dat$date, "%d-%m")
  dat$is_NYD <- day_month == "01-01"
  dat
}
add_month <- function(dat) {
  # Numeric month (1-12) extracted from the date column.
  dat[["month"]] <- month(dat[["date"]])
  dat
}
add_is_jan <- function(dat) {
  # Flag rows falling in January.
  dat[["is_jan"]] <- month(dat[["date"]]) == 1
  dat
}
add_week <- function(dat) {
  # Week of year (1-53) from the date column.
  dat[["week"]] <- week(dat[["date"]])
  dat
}
# Augment the daily training data with the engineered features.
# "2016-01-01" is the first date in the data, so its n_pre is NA.
train_dat <- add_prev_day(train_dat, "2016-01-01")
train_dat <- add_dow(train_dat)
train_dat <- add_is_fom(train_dat)
train_dat <- add_is_christmas(train_dat)
train_dat <- add_is_NYD(train_dat)
train_dat <- add_month(train_dat)
train_dat <- add_is_jan(train_dat)
train_dat <- add_week(train_dat)
# GAM 2: seasonal smooth + previous-day count as a linear term.
# The first row has n_pre = NA, so gam() drops it; predictions are one
# element shorter than train_dat — hence the date[-1] indexing below.
gam2 <- gam(n ~ s(as.numeric(yday), bs = "cc") + n_pre, data = train_dat, family = "poisson")
# Zoom on February 2016 to inspect the short-term fit.
plot(train_dat$date, train_dat$n, xlim = c(date("2016-02-01"), date("2016-02-28")))
points(train_dat$date[-1], predict(gam2, type = "response"), col = "red")
# Full-period fit.
plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam2, type = "response"), col = "red")
# GAM 3: add a cubic-regression smooth over day-of-week (k = 5 keeps the
# basis small for only 7 distinct values).
gam3 <- gam(n ~ s(as.numeric(yday), bs = "cc") + s(as.numeric(dow), bs = "cr", k = 5) + n_pre, data = train_dat, family = "poisson")
plot(train_dat$date, train_dat$n, xlim = c(date("2016-02-01"), date("2016-02-28")))
points(train_dat$date[-1], predict(gam3, type = "response"), col = "red")
plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam3, type = "response"), col = "red")
# GAM 4: day-of-week as an unordered factor instead of a smooth.
gam4 <- gam(n ~ s(as.numeric(yday), bs = "cc") + as.factor(dow) + n_pre, data = train_dat, family = "poisson")
# Zoom on the 2017/18 winter to check holiday behaviour.
plot(train_dat$date, train_dat$n, xlim = c(date("2017-12-01"), date("2018-02-28")))
points(train_dat$date[-1], predict(gam4, type = "response"), col = "red")
plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam4, type = "response"), col = "red")
# GAM 5: all engineered features — holiday flags plus week-of-year factor.
gam5 <- gam(n ~ s(as.numeric(yday), bs = "cc") + as.factor(dow) + n_pre + is_fom + is_christmas +
is_jan + is_NYD + as.factor(week), data = train_dat, family = "poisson")
plot(train_dat$date, train_dat$n, xlim = c(date("2016-02-01"), date("2016-02-28")))
points(train_dat$date[-1], predict(gam5, type = "response"), col = "red")
plot(train_dat$date, train_dat$n)
points(train_dat$date[-1], predict(gam5, type = "response"), col = "red")
# Residuals (observed minus fitted) over time; [-1] again drops the
# NA n_pre row so lengths match.
plot(train_dat$date[-1], train_dat$n[-1] - predict(gam5, type = "response"))
library(sf)
## Linking to GEOS 3.9.0, GDAL 3.2.0, PROJ 7.2.0
# Load the community area boundaries and order them by area number so
# polygon order lines up with the community_area coding in the crime data.
com_bounds <- st_read("data/community_areas.shp", quiet = TRUE)
com_bounds <- arrange(com_bounds, as.integer(area_numbe))
com_bounds
## Simple feature collection with 77 features and 9 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -87.94011 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
## geographic CRS: WGS84(DD)
## First 10 features:
## area area_num_1 area_numbe comarea comarea_id community perimeter
## 1 0 1 1 0 0 ROGERS PARK 0
## 2 0 2 2 0 0 WEST RIDGE 0
## 3 0 3 3 0 0 UPTOWN 0
## 4 0 4 4 0 0 LINCOLN SQUARE 0
## 5 0 5 5 0 0 NORTH CENTER 0
## 6 0 6 6 0 0 LAKE VIEW 0
## 7 0 7 7 0 0 LINCOLN PARK 0
## 8 0 8 8 0 0 NEAR NORTH SIDE 0
## 9 0 9 9 0 0 EDISON PARK 0
## 10 0 10 10 0 0 NORWOOD PARK 0
## shape_area shape_len geometry
## 1 51259902 34052.40 MULTIPOLYGON (((-87.65456 4...
## 2 98429095 43020.69 MULTIPOLYGON (((-87.68465 4...
## 3 65095643 46972.79 MULTIPOLYGON (((-87.64102 4...
## 4 71352328 36624.60 MULTIPOLYGON (((-87.67441 4...
## 5 57054168 31391.67 MULTIPOLYGON (((-87.67336 4...
## 6 87214799 51973.10 MULTIPOLYGON (((-87.64102 4...
## 7 88316400 49478.43 MULTIPOLYGON (((-87.63182 4...
## 8 76675896 57293.16 MULTIPOLYGON (((-87.62446 4...
## 9 31636314 25937.23 MULTIPOLYGON (((-87.80676 4...
## 10 121959105 80368.37 MULTIPOLYGON (((-87.78002 4...
# Create a neighbours list for the MRF smooth: which community areas
# share a border with which.
com_nb <- spdep::poly2nb(com_bounds, row.names = com_bounds$community)
# mgcv's "mrf" basis matches neighbours to factor levels by name, so the
# list elements must be named.
names(com_nb) <- attr(com_nb, "region.id")
# Centroids are only used to draw the neighbour graph; the lon/lat
# centroid warning is acceptable for a quick visual check.
coords <- st_coordinates(st_centroid(st_geometry(com_bounds)))
## Warning in st_centroid.sfc(st_geometry(com_bounds)): st_centroid does not give
## correct centroids for longitude/latitude data
plot(st_geometry(com_bounds), border = "grey")
plot(com_nb, coords, add = TRUE)
# Construct training data set: weekly crime counts per community area.
# community_area must be a factor so the MRF smooth can match its levels
# to the names of com_nb.
spatial_dat <- gam_train %>%
mutate(community_area = factor(community_area)) %>%
count(week, year, community_area) %>%
arrange(year, week, community_area)
# Train GAM: cyclic weekly smooth plus a Markov random field over the
# community-area adjacency structure. nthreads = 8 parallelises fitting.
ctrl <- gam.control(nthreads = 8)
gam_spat <- gam(n ~ s(as.numeric(week), bs = "cc") +
s(community_area, bs = "mrf", xt = list(nb = com_nb)),
data = spatial_dat, family = "poisson", control = ctrl)
summary(gam_spat)
##
## Family: poisson
## Link function: log
##
## Formula:
## n ~ s(as.numeric(week), bs = "cc") + s(community_area, bs = "mrf",
## xt = list(nb = com_nb))
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.830587 0.001607 2384 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(as.numeric(week)) 7.989 8 7738 <2e-16 ***
## s(community_area) 75.970 76 444841 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.939 Deviance explained = 93.9%
## UBRE = 1.7789 Scale est. = 1 n = 12233
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:chigcrim':
##
## last_plot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Slice out the first week of 2016, attach the model's fitted counts,
# and join onto the community polygons for mapping.
plot_data <- spatial_dat %>% filter(year == 2016, week == 1)
fitted_counts <- predict(gam_spat, newdata = plot_data, type = "response")
plot_data <- plot_data %>%
  mutate(fit = fitted_counts) %>%
  select(community_area, n, fit)
plot_data <- com_bounds %>% left_join(plot_data, by = c("area_numbe" = "community_area"))
# Shared ggplot layers reused by the actual and fitted maps.
gglayers <- list(ggtitle("Actual Counts of Crime in Chicago"),
colorspace::scale_fill_continuous_sequential(5, palette = "Inferno"),
theme_void(),
guides(fill = guide_colorbar(title = "Crime Count")))
# The "text" aesthetic is unknown to geom_sf (hence the warning) but is
# picked up by ggplotly for hover tooltips.
p <- ggplot(data = plot_data) +
geom_sf(aes(fill = n, text = paste0(community, ": ", n))) +
gglayers
## Warning: Ignoring unknown aesthetics: text
# Interactive map; restyle hover to the polygon fills (trace 1 is the
# colourbar, so start from trace 2).
p %>% ggplotly(tooltip = "text") %>%
style(hoverlabel = list(bgcolor = "white"),
hoveron = "fill", traces = seq.int(2, length(p$x$data)))
# Interactive map of the model's fitted counts (same layers as the
# actual-counts map above).
p1 <- ggplot(data = plot_data) +
geom_sf(aes(fill = fit, text = paste0(community, ": ", fit))) +
gglayers
## Warning: Ignoring unknown aesthetics: text
# FIX: the trace count must come from p1, not p (copy-paste error);
# the two plots are not guaranteed to have the same number of traces.
p1 %>% ggplotly(tooltip = "text") %>%
style(hoverlabel = list(bgcolor = "white"),
hoveron = "fill", traces = seq.int(2, length(p1$x$data)))